# %% import
# pip install langchain langchain-community langchain-openai chromadb sentence-transformers python-dotenv
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# %% document loaders
# Read Sample.txt as UTF-8 text.
loader = TextLoader(file_path="Sample.txt", encoding="utf-8")
documents = loader.load()

# The whole file is treated as one single document.
print(len(documents))  # 1
# %% Document transformers
# Split the document into small chunks so that a single request never
# sends more than the LLM's input limit.
#   chunk_size    -> size of each chunk
#   chunk_overlap -> overlap between the end of one chunk and the start of
#                    the next one (0 = no overlap)
text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0)

texts = text_splitter.split_documents(documents=documents)

print(len(texts))  # 19

# %% Text embedding models
import os

# Option 1: set the OpenAI credentials directly in the process environment.
# os.environ['OPENAI_API_KEY'] = ""
# os.environ['OPENAI_API_BASE'] = ""
from dotenv import load_dotenv

# Option 2 (used here): keep the credentials in a local .env file and let
# python-dotenv load them into the environment.
load_dotenv()

# Constructed without arguments, so the credentials presumably come from the
# environment prepared above.
embeddings = OpenAIEmbeddings()

# Alternative backend: Hugging Face sentence_transformers embeddings.
# Keep real access tokens out of source control; provide the token through
# the environment or the .env file instead of hard-coding it.
# os.environ['HUGGINGFACEHUB_API_TOKEN'] = '<your-huggingface-token>'
# embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
print(embeddings)

# %% Vector stores -> Store and search over embedded data
# Embed every text chunk and load the vectors into Chroma (text -> vector).
db = Chroma.from_documents(texts, embeddings)
# Have a look at the stored embeddings (the numeric representation of the
# chunks). The public Chroma.get() is used instead of reaching into the
# private `_collection` attribute.
print(db.get(include=['embeddings']))

# %% Retrievers
# Wrap the vector store in a retriever so the data can be queried.
# "k" is the number of results to return per query (k=2 -> two results).
retriever = db.as_retriever(search_kwargs=dict(k=1))
# Example output:
# vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001DE2C751E20> search_kwargs={'k': 1}
print(retriever)

# %% question
# invoke() is the current retriever entry point; it replaces the deprecated
# get_relevant_documents() and returns the same list of Documents.
#
# Query 1 -- expected output (with k=2, two Documents would be returned):
# [Document(page_content='Delhi is the capital of India', metadata={'source': 'Sample.txt'})]
docs = retriever.invoke("What is the capital of india?")
print(docs)
# Query 2 -- expected output:
# [Document(page_content='The Indian rupee is the official currency in the Republic of India. The rupee is subdivided into 100 paise. The issuance of the currency is controlled by the Reserve Bank of India.', metadata={'source': 'Sample.txt'})]
docs = retriever.invoke("What is the currency of india?")
print(docs)
